# Read data
spotify_songs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv')
# Check data
dim(spotify_songs)
## [1] 32833    23
# Let's convert into data.table for efficiency reasons
spotify_songs <- as.data.table(spotify_songs)

# Skim the data to get some sense of variable attributes
id_cols <- c('track_id', 'track_album_id', 'playlist_id')
skimr::skim(spotify_songs[, setdiff(colnames(spotify_songs), id_cols)])
Data summary
Name spotify_songs[, setdiff(c…
Number of rows 20
Number of columns 1
_______________________
Column type frequency:
factor 1
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
data 0 1 FALSE 20 aco: 1, dan: 1, dur: 1, ene: 1

I think it was definitely worth skimming the data, since it provide a nice overview of the data. With that in hand, we are able to phrase relevant/interesting questions about the spotify dataset. I am going to classify these questions into two groups:

1. Data quality:

2. Data exploration: what insights can we infer from the data? I am rather interested in two main topics.

Enough is said, let’s get to work!

Data quality

# First check missing data
spotify_songs[is.na(track_name) ]
##                  track_id track_name track_artist track_popularity
## 1: 69gRFGOWY9OMpFJgFol1u0       <NA>         <NA>                0
## 2: 5cjecvX0CmC9gK0Laf5EMQ       <NA>         <NA>                0
## 3: 5TTzhRSWQS4Yu8xTgAuq6D       <NA>         <NA>                0
## 4: 3VKFip3OdAvv4OfNTgFWeQ       <NA>         <NA>                0
## 5: 69gRFGOWY9OMpFJgFol1u0       <NA>         <NA>                0
##            track_album_id track_album_name track_album_release_date
## 1: 717UG2du6utFe7CdmpuUe3             <NA>               2012-01-05
## 2: 3luHJEPw434tvNbme3SP8M             <NA>               2017-12-01
## 3: 3luHJEPw434tvNbme3SP8M             <NA>               2017-12-01
## 4: 717UG2du6utFe7CdmpuUe3             <NA>               2012-01-05
## 5: 717UG2du6utFe7CdmpuUe3             <NA>               2012-01-05
##                    playlist_name            playlist_id playlist_genre
## 1:                       HIP&HOP 5DyJsJZOpMJh34WvUrQzMV            rap
## 2:                   GANGSTA Rap 5GA8GDo7RQC3JEanT81B3g            rap
## 3:                   GANGSTA Rap 5GA8GDo7RQC3JEanT81B3g            rap
## 4: Reggaeton viejito<U+0001F525> 0si5tw70PIgPkY1Eva6V8f          latin
## 5:                 latin hip hop 3nH8aytdqNeRbcRCg3dw9q          latin
##    playlist_subgenre danceability energy key loudness mode speechiness
## 1:  southern hip hop        0.714  0.821   6   -7.635    1      0.1760
## 2:      gangster rap        0.678  0.659  11   -5.364    0      0.3190
## 3:      gangster rap        0.465  0.820  10   -5.907    0      0.3070
## 4:         reggaeton        0.675  0.919  11   -6.075    0      0.0366
## 5:     latin hip hop        0.714  0.821   6   -7.635    1      0.1760
##    acousticness instrumentalness liveness valence   tempo duration_ms
## 1:       0.0410          0.00000   0.1160   0.649  95.999      282707
## 2:       0.0534          0.00000   0.5530   0.191 146.153      202235
## 3:       0.0963          0.00000   0.0888   0.505  86.839      206465
## 4:       0.0606          0.00653   0.1030   0.726  97.017      252773
## 5:       0.0410          0.00000   0.1160   0.649  95.999      282707

It seems like we have all the data except for attributes of the artist. This isn’t really surprise because spotify_songs data.table seems to come from a transactional database and thus only the manually adminstered data such as artist name etc.. can be missing. Let’s drop these observations

spotify_songs <- na.omit(spotify_songs)
data_dim <- dim(spotify_songs)

To check upon the granularity of the dataset, look for songs that are present more than once

# First check whether the rows are unique but I suspect it isn't the cause
dim(unique(spotify_songs)) # not it isnt't
## [1] 32828    23
# check song counts
dups <- spotify_songs[, .(song_cnt = .N), by = track_id][song_cnt > 1][order(-song_cnt)]
dups
##                     track_id song_cnt
##    1: 7BKLCZ1jbUBVqRi2FVlTVw       10
##    2: 14sOS5L36385FJ3OL8hew4        9
##    3: 3eekarcy7kvN4yt5ZFzltW        9
##    4: 6oJ6le65B3SEqPwMRNXWjY        8
##    5: 6wo37KVqFJhtuxPTpLCcfe        8
##   ---                                
## 3161: 3rwdcyPQ37SSsf1loOpux9        2
## 3162: 0rCuRc07y6l1kPYj0JSRg5        2
## 3163: 1nas007nDbzLwDGwvMdz79        2
## 3164: 0jPU39bL0SrCmYc1RwbFkX        2
## 3165: 2sApUgcQQL2m0quuYgq0ec        2
dups_sum <- sum(dups$song_cnt)

# check the one with the most presence
spotify_songs[track_id == '7BKLCZ1jbUBVqRi2FVlTVw'][
  ,c('track_name', 'track_artist', 'track_album_name', 'playlist_name', 'playlist_id')]
##                track_name     track_artist      track_album_name
##  1: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
##  2: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
##  3: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
##  4: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
##  5: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
##  6: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
##  7: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
##  8: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
##  9: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
## 10: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
##                                                                                             playlist_name
##  1:                                                                                             Dance Pop
##  2:                                                                                         Post pop teen
##  3:                                                                            Electropop Hits  2017-2020
##  4:                                                                 A Loose Definition of Indie Poptimism
##  5: <U+25E4> Hip Hop Dance Music – Urban – Trap – Breaking Locking Popping Bopping – WOD – World of Dance
##  6:                                                                            Tropical House Run 190 BPM
##  7:                       2020 Hits & 2019  Hits – Top Global Tracks <U+0001F525><U+0001F525><U+0001F525>
##  8:                       2020 Hits & 2019  Hits – Top Global Tracks <U+0001F525><U+0001F525><U+0001F525>
##  9:                       2020 Hits & 2019  Hits – Top Global Tracks <U+0001F525><U+0001F525><U+0001F525>
## 10:                                                                                            2015 songs
##                playlist_id
##  1: 37i9dQZF1DWZQaaqNMbbXa
##  2: 222nc9tKxKhfZ2GBrOpwH3
##  3: 7kyvBmlc1uSqsTL0EuNLrx
##  4: 4ZO0wp9G8FA3X6oYNBzda6
##  5: 0Hr2h94pKN8QAGVAgD6BsD
##  6: 37i9dQZF1DWSTc9FdySHtz
##  7: 4JkkvMpVl4lSioqQjeAL0q
##  8: 4JkkvMpVl4lSioqQjeAL0q
##  9: 4JkkvMpVl4lSioqQjeAL0q
## 10: 6UJw1egIcZVfrBmcKs5uHH

It seems like that a track being present in more playlists is causing this. If this is the case, then track_id and playlist_id together should be the granularity of the data. Let’s prove this:

# which equals
dim(dups)[1] - dups_sum
## [1] -4476

This equals number of records - distinct track ids. This was important to check because we can be quite sure that not the song changed but it was just included in more playlists. We will need to keep this in mind when analyzing the song attributes to not to include the same song multiple times! Next up is to inspect album name lengths.

# create var
spotify_songs[, album_name_length := nchar(track_album_name)]

# viz length distribution
ggplot(spotify_songs, aes(album_name_length)) + 
  geom_histogram(colour="darkblue", fill="lightblue") +
  geom_vline(aes(xintercept=mean(album_name_length)),
    color="purple", linetype="dashed", size=1) + 
    labs(title = "ALbum name char. length distribution") +
    theme(plot.title = element_text(hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# check what can you make out of 50 chars    
spotify_songs[album_name_length == 50][1:2,track_album_name]
## [1] "All My Friends (feat. Tinashe & Chance the Rapper)"
## [2] "Don't Leave Me Lonely (Purple Disco Machine Remix)"
# check what can you make out of 70 chars    
spotify_songs[album_name_length == 70][1:2, track_album_name]
## [1] "Beautiful (feat. Camila Cabello) [Bazzi vs. EDX's Ibiza Sunrise Remix]"
## [2] "South of the Border (feat. Camila Cabello & Cardi B) [Sam Feldt Remix]"
# check what can you make out of 50 chars    
spotify_songs[album_name_length == 90][1:2, track_album_name]
## [1] "Dancing Time, the Best of Eastern Nigeria's Afro Rock Exponents 1973-77 (Soundway Records)"
## [2] "Dancing Time, the Best of Eastern Nigeria's Afro Rock Exponents 1973-77 (Soundway Records)"
# check what can you make out of 50 chars    
spotify_songs[album_name_length == 110][, track_album_name]
## [1] "Hard Rock Music: Clásicos del Rock, Baladas Heavy, Música Rock Melódico Alternativo de los Anos 60's 70's 80's"
## [2] "Hard Rock Music: Clásicos del Rock, Baladas Heavy, Música Rock Melódico Alternativo de los Anos 60's 70's 80's"

Well, these seem to reasonable names. But what about playlists? They should have rather short and not too specific names. Let’s do the same for playlists.

# create var
spotify_songs[, playlist_name_length := nchar(playlist_name)]

# viz length distribution
ggplot(spotify_songs, aes(playlist_name_length)) + 
  geom_histogram(colour="darkblue", fill="lightblue") +
  geom_vline(aes(xintercept=mean(album_name_length)),
    color="purple", linetype="dashed", size=1) + 
    labs(title = "Playlist name char. number distribution") +
    theme(plot.title = element_text(hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# check what can you make out of 50 chars    
spotify_songs[playlist_name_length > 50][1, playlist_name]
## [1] "Ultimate Indie Presents... Best Indie Tracks of the 2010s"
# check what can you make out of 70 chars    
spotify_songs[playlist_name_length > 100][1, unique(playlist_name)]
## [1] "Techno House 2020 <U+0001F47D> Best Collection <U+0001F47B> Top DJ’s Electronic Music - Deep House - Trance - Tech House - Dance - Electro Pop"

U0001f47d are emoji encodings, so if we want to really analyze playlist name lengths, then we should filter these out. It could be done by R’s base enc2native() but I will just leave it this way because I just wanted to see name with long lenghts.

EDA

Check how subgenres relate to genres

spotify_songs[, unique(playlist_subgenre), by=playlist_genre]
##     playlist_genre                        V1
##  1:            pop                 dance pop
##  2:            pop             post-teen pop
##  3:            pop                electropop
##  4:            pop           indie poptimism
##  5:            rap                   hip hop
##  6:            rap          southern hip hop
##  7:            rap              gangster rap
##  8:            rap                      trap
##  9:           rock                album rock
## 10:           rock              classic rock
## 11:           rock            permanent wave
## 12:           rock                 hard rock
## 13:          latin                  tropical
## 14:          latin                 latin pop
## 15:          latin                 reggaeton
## 16:          latin             latin hip hop
## 17:            r&b        urban contemporary
## 18:            r&b                   hip pop
## 19:            r&b            new jack swing
## 20:            r&b                  neo soul
## 21:            edm             electro house
## 22:            edm                  big room
## 23:            edm                   pop edm
## 24:            edm progressive electro house
##     playlist_genre                        V1

From now on, the song attributes such as tempo, loudness etc.. will be analyzed. Since songs are duplicated, let’s filter for unique songs.

songs <- unique(spotify_songs, by = 'track_id')
dim(songs)
## [1] 28352    25

Which artists/albums are the most popular? Filter for more than 20 tracks.

# plot an artist ranking
# horizontal
ggplot(songs[, .(avg_popularity = mean(track_popularity),
          pop_sd = sd(track_popularity),
          track_count = .N
          ) , by = track_artist][track_count > 20][order(-avg_popularity)][1:10,]
) +
  geom_bar( aes(x=reorder(track_artist, avg_popularity), y=avg_popularity), stat="identity", fill="skyblue", alpha=0.5) +
  geom_errorbar( aes(x=track_artist, 
                     ymin=avg_popularity-pop_sd, 
                     ymax=avg_popularity+pop_sd), width=0.4, colour="orange", alpha=0.9, size=1.3) +
  coord_flip() +
  labs(title = 'Top 10 artists', x = 'artists')

The sd range is big due to the small number of observations. Do the same for albums. Here I think gotta be more cautios because I suspect album names to be not unique for different aritsts.

# get best albums by popularity only consider albums w. at least 5 tracks
fav_albums <- songs[, .(avg_popularity = mean(track_popularity),
          pop_sd = sd(track_popularity),
          track_count = .N
          ) , by = track_album_id][order(-avg_popularity)][track_count > 5][1:10,]

# merge to get artist names
fav_albums <- merge(fav_albums, 
      unique(songs, by = c('track_album_id', 'track_album_name', 'track_artist'))
      [, c('track_album_id', 'track_album_name', 'track_artist')], 
      all.x =T, all.y = F, by = 'track_album_id')

# create var to display album name + performer
fav_albums[, artist_album:= do.call(paste, c(.SD, sep = " - ")), 
           .SDcols=c('track_artist', 'track_album_name')] 

ggplot(fav_albums) +
  geom_bar( aes(x=reorder(artist_album, avg_popularity), y=avg_popularity), stat="identity",
            fill="skyblue", alpha=0.5) +
  coord_flip() +
  labs(title = 'Top 10 artists', x = 'artists')

I am really interested whether the playlist genres really group the alike songs. I consider alike songs that have their attributes like energy and loudness in a similar range. To do this, let’s apply PCA and cluster the songs and then check whether the clusters capture the genres.

library(caret)
## Loading required package: lattice
library(NbClust)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
pca_vars <- colnames(songs) 
# extract numeric vars
pca_vars <- pca_vars[!(startsWith(pca_vars, 'track') |  startsWith(pca_vars, 'playlist') |
                       startsWith(pca_vars, 'album'))]

pca_result <- prcomp(songs[, pca_vars, with=F], scale. = T)
(pca_result$rotation)[, seq(1:5)]
##                           PC1         PC2          PC3         PC4          PC5
## danceability     -0.038589823  0.59561843 -0.016516675  0.31435332  0.119081422
## energy            0.613189473 -0.02732757  0.015534194  0.09168641  0.030901600
## key               0.006032126  0.07712829  0.678512348 -0.01394207 -0.075493284
## loudness          0.554175947  0.08058271 -0.039621211 -0.05993675  0.120719682
## mode             -0.009993168 -0.13621189 -0.677135004  0.06782140 -0.092004567
## speechiness       0.001006913  0.35511364  0.075934593 -0.45391938 -0.005170522
## acousticness     -0.490747777  0.04576793 -0.038233534 -0.23650100 -0.018503291
## instrumentalness -0.072798645 -0.29053929  0.194321709  0.36870109  0.406964990
## liveness          0.162448199 -0.12713977  0.065513510 -0.37241774 -0.572072811
## valence           0.103545428  0.52332705 -0.120740090  0.19591640 -0.217598825
## tempo             0.178891130 -0.22515211 -0.007943339 -0.31801744  0.291021579
## duration_ms      -0.003888301 -0.24526149  0.122894873  0.45635483 -0.575176207
fviz_pca(pca_result, label = F, alpha = 0.2)

Well there is a lot of overlap. Lets color the songs (dots) by genre, we might explore patterns

# create a data table from the pca result
pca_songs <- as.data.table(pca_result$x)
# assign rows as ids
pca_songs[, id:=row.names(pca_songs)]
# do the same for the `original` dataset
songs[, id:=row.names(songs)]


ggplot(merge(pca_songs, songs[, c('id', 'playlist_genre')], 
      all.x = T, all.y = T, by = 'id'), 
      aes(PC1, PC2, color = playlist_genre)) +
  geom_point(alpha = 0.2)

Well, there is a lot of data points and a lot of overlap. If I may try hard enough, I may find some patterns between the genres but just by looking at this graph, it is nearly impossible. We would be better of if we analyzed this problem analytically, for example clustering in this PCA space and calculate for example dissimilarity scores between the clusters. But that would be out of the scope of this assignment.

Let’s make racing bars based on how many songs were made per year per subgenre. This page was really helpful when doings this: link

# create a Year col
songs[, Y := format(as.Date(track_album_release_date), "%Y") ]
# select relevant cols
data_racing <- songs[, c('Y', 'playlist_subgenre', 'track_id')]
data_racing <- na.omit(data_racing)  # needed for the ggplot

data_racing <- data_racing[, .(cnt = .N), by = c('Y', 'playlist_subgenre' ) ]
setorderv(data_racing, c('Y', 'cnt'))
# cumulative count
data_racing[, cnt_cum := cumsum(cnt), by = c( 'playlist_subgenre')]
# not necessary col
data_racing[,cnt:=NULL]

data_racing[order(c('YM', 'playlist_subgenre'))]
##       Y  playlist_subgenre cnt_cum
## 1: 1957 urban contemporary       1
## 2: 1958       classic rock       1
# check if this works as expected
data_racing[playlist_subgenre == 'album rock'][1:8]
##       Y playlist_subgenre cnt_cum
## 1: 1963        album rock       1
## 2: 1964        album rock       3
## 3: 1965        album rock       4
## 4: 1966        album rock       7
## 5: 1967        album rock      13
## 6: 1968        album rock      17
## 7: 1969        album rock      28
## 8: 1970        album rock      49

I really prefer pivoting data.frames and not data.tables

library(tidyr)
library(tidyverse)
# only to get the 'full' range of year for example there are years for which there was no tracks released in a genre
data_racing <- pivot_wider(data_racing, names_from = Y, values_from = cnt_cum )
data_racing <- pivot_longer(data_racing, -playlist_subgenre, names_to = "Y", values_to = "album_cumsum")

# forward fill values. If there was no track in 1970 in a genre then it will be imputed with the last value
data_racing <- data_racing %>% 
  group_by(playlist_subgenre) %>% 
  fill(album_cumsum)

# shape for ggplot
data_racing <- data_racing %>% 
  group_by(Y) %>% 
  arrange(-album_cumsum) %>% 
  mutate(rank = row_number()) %>% 
  filter( rank <= 10) %>% 
  arrange(Y)
library(gganimate)
gen_subgenre <- unique(songs[, c('playlist_genre', 'playlist_subgenre')])
data_racing <- merge(data_racing, gen_subgenre, by = 'playlist_subgenre', all.x = T, all.y = F)
data_racing$Y <- as.numeric(data_racing$Y)
# write_rds(racing_rds, 'racing.rds')

p <- data_racing %>%
  ggplot(aes(x=-rank, y=album_cumsum, fill = playlist_genre)) +
  geom_tile(aes(y = album_cumsum/2, height=album_cumsum),width=0.9)+
  geom_text(aes(label=playlist_subgenre),
            hjust="right",
            colour="black",
            fontface="bold",
            nudge_y=-100)+
  geom_text(aes(label=scales::comma(album_cumsum)),
            hjust="left",
            nudge_y=50,
            colour="grey30")+
  theme_minimal() +
  coord_flip(clip="off") +
  scale_x_discrete("") +
  scale_y_continuous("",labels=scales::comma)+
  scale_fill_brewer(palette = 'RdYlBu') +
  theme(panel.grid.major.y=element_blank(),
        panel.grid.minor.x=element_blank(),
        plot.title= element_text(size=20,colour="grey50",face="bold"),
        plot.caption = element_text(colour="grey50"),
        plot.subtitle = element_text(size=20,colour="grey50",face="bold"),
        plot.margin = margin(1,1,1,2,"cm"),
        axis.text.y=element_blank())+
  transition_time(Y) +
  labs(title='Number of tracks released in {round(frame_time,0)}',
       caption='made by Gerold')

# I couldn't make this work so in an rmdown so I animated and saved the gif in an R script
# rendering a video would help solving this but would have taken too much time to deal with this
#animate(p, duration = 30, fps = 25, end_pause = 20) 
# anim_save("racing-bars.gif")